{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d35b8270",
   "metadata": {},
   "outputs": [],
   "source": [
    "from lightgbm import LGBMClassifier\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import RandomForestClassifier\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c407a6a7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 424 entries, 0 to 423\n",
      "Columns: 20229 entries, A1BG to label\n",
      "dtypes: float64(20228), int64(1)\n",
      "memory usage: 65.4 MB\n"
     ]
    }
   ],
   "source": [
    "# Load the dataset from CSV and inspect its structure.\n",
    "df = pd.read_csv('./cancer_dataset_ver1.csv')\n",
    "\n",
    "# info() prints the index range, column count, dtypes and memory usage.\n",
    "# A bare `df.head()` before it would not be shown, because a notebook only\n",
    "# displays the final expression of a cell, so it is omitted here.\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4e150940",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 339 entries, 126 to 242\n",
      "Columns: 20229 entries, A1BG to label\n",
      "dtypes: float64(20228), int64(1)\n",
      "memory usage: 52.3 MB\n"
     ]
    }
   ],
   "source": [
    "# Features are every column except the last; the last column is the target.\n",
    "X = df.iloc[:, :-1]\n",
    "y = df.iloc[:, -1]\n",
    "\n",
    "# Fix the seed so the 80/20 split (and the CSV files written from it) is\n",
    "# identical on every Restart & Run All.\n",
    "# NOTE(review): consider adding stratify=y to preserve the class ratio in\n",
    "# both splits -- confirm every class has at least two samples first.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.2, random_state=42\n",
    ")\n",
    "\n",
    "# Re-attach the target so each split is a single self-contained frame.\n",
    "train_dataset = pd.concat([X_train, y_train], axis=1)\n",
    "test_dataset = pd.concat([X_test, y_test], axis=1)\n",
    "\n",
    "# Every row must land in exactly one of the two splits.\n",
    "assert len(train_dataset) + len(test_dataset) == len(df)\n",
    "\n",
    "train_dataset.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "7e58dc91",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): the disabled export below refers to `df1`, which is not\n",
    "# defined anywhere in this notebook -- confirm whether it can be deleted.\n",
    "#df1.to_csv(\"./cancer_dataset_ver1.csv\", header =True, index = False)\n",
    "\n",
    "# Write each split to its own CSV, with a header row and without the index.\n",
    "train_dataset.to_csv(\n",
    "    \"./cancer_train_dataset.csv\",\n",
    "    index=False,\n",
    "    header=True,\n",
    ")\n",
    "test_dataset.to_csv(\n",
    "    \"./cancer_test_dataset.csv\",\n",
    "    index=False,\n",
    "    header=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d36c56ac",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       A1BG      A1CF     A2LD1       A2M     A2ML1    A4GALT     A4GNT  \\\n",
      "0  3.649152  1.745312  3.448880  5.099026  2.572422  2.475223  1.950954   \n",
      "1  3.565131  2.127849  3.757838  6.077721  2.538961  2.980286  1.908825   \n",
      "2  3.701450  2.005312  3.712522  4.369133  2.192373  3.345347  1.924172   \n",
      "3  4.189229  2.188217  4.213793  4.587069  2.640741  2.723166  2.001177   \n",
      "4  4.379448  2.120995  3.995127  3.978380  2.093292  3.199951  2.136889   \n",
      "\n",
      "       AAA1      AAAS      AACS  ...     ZWINT      ZXDA      ZXDB      ZXDC  \\\n",
      "0  1.823706  5.200664  4.337035  ...  3.772052  5.292323  3.622881  3.032975   \n",
      "1  1.910576  4.446920  3.584887  ...  3.844710  5.264879  3.984051  2.110187   \n",
      "2  2.076641  4.772986  3.416875  ...  3.535506  5.452824  4.346888  2.522582   \n",
      "3  2.099061  4.910108  3.435449  ...  3.465616  6.204885  4.168669  1.962237   \n",
      "4  1.781531  4.663655  4.109242  ...  4.081812  6.238755  3.779931  1.933819   \n",
      "\n",
      "     ZYG11A    ZYG11B       ZYX     ZZEF1      ZZZ3  label  \n",
      "0  3.007544  4.361043  4.514723  3.842147  4.964005      1  \n",
      "1  2.155425  4.085723  4.536928  3.086250  3.759388      0  \n",
      "2  1.791178  3.935233  4.479023  4.014399  4.006972      1  \n",
      "3  2.231639  3.380808  4.467080  4.351137  4.559030      1  \n",
      "4  2.529574  4.053361  4.477355  3.762320  4.306754      1  \n",
      "\n",
      "[5 rows x 20229 columns]\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 339 entries, 0 to 338\n",
      "Columns: 20229 entries, A1BG to label\n",
      "dtypes: float64(20228), int64(1)\n",
      "memory usage: 52.3 MB\n"
     ]
    }
   ],
   "source": [
    "# Read the exported training split back from disk.\n",
    "tdf = pd.read_csv(\"./cancer_train_dataset.csv\")\n",
    "\n",
    "# Print the first five rows as plain text, then the structural summary.\n",
    "print(tdf.head(n=5))\n",
    "tdf.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bb94e887",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
